---
title: "Analyse der Matoma-HaNS-Daten"
toc: true
number-sections: true
format:
  html:
    theme: lumen
    embed-resources: true
    toc: true
    toc-location: right
    toc-depth: 3
    number-sections: true
    code-fold: true
    code-summary: "Show the code"
    code-tools: true
execute:
  warning: false
  cache: true
---
# Setup
## R-Pakete starten
```{r load-libs}
library(targets)
library(tidyverse)
library(ggokabeito)
library(easystats)
library(gt)
library(ggfittext)
library(scales)
```
```{r}
#| cache: false
# Make theme_minimal() the session-wide ggplot2 default.
# This chunk only has a side effect (it changes ggplot2's global theme), and
# knitr does not replay side effects when a chunk is restored from cache.
# Caching is therefore switched off here so the theme is set on every render.
theme_set(theme_minimal())
```
## Roh-Daten laden und inspizieren
JSON-Daten wurden nicht importiert, da offenbar nur redundante Daten enthalten sind.
```{r load-data-all-f}
# Load the `data_all_fct` target from the {targets} store into the session.
tar_load(data_all_fct)
```
### Dimension
Der Roh-Datensatz verfügt über
- `r nrow(data_all_fct)` Zeilen
- `r ncol(data_all_fct)` Spalten (Dubletten und Spalten mit Bildern bereits entfernt)
Jede Zeile entspricht einem "Visit".
## Datensatz nur User
Entfernt man Developer, Admins und Lecturers aus dem Roh-Datensatz, so bleiben weniger Zeilen übrig:
```{r load-data-users-only}
# Load the `data_users_only` target from the {targets} store into the session.
tar_load(data_users_only)
```
- `r nrow(data_users_only)` Zeilen
- `r ncol(data_users_only)` Spalten
## Datensatz mit Anzahl der Aktionen pro User
```{r load-count-action}
# Load the `count_action` target; its `n_max` column is summarised below.
tar_load(count_action)
```
## Zeitraum (Beginn, Ende) der Daten
```{r}
# Load the `paths` target; `paths$this_semester` is reported in the text below.
tar_load(paths)
```
Laut `config.yaml` ist das aktuelle Semester `r paths$this_semester`.
```{r}
# Load the `time_minmax` target (columns `time_min` and `time_max` are used).
tar_load(time_minmax)
```
```{r}
# Reduce `time_minmax` to its earliest `time_min` and latest `time_max`
# and render the result as a gt table.
time_minmax |>
  summarise(
    time_min = min(time_min),
    time_max = max(time_max)
  ) |>
  gt()
```
Diese Statistik wurde auf Basis des Datenobjekts `data_slim` berechnet.
## Statistiken
### Mit den 499er-Daten
```{r}
# Descriptive statistics of `n_max` over all rows (including the 499 values),
# shown as a gt table with numeric columns formatted to two decimals.
count_action |>
  describe_distribution(n_max) |>
  gt() |>
  fmt_number(
    columns = where(is.numeric),
    decimals = 2
  )
```
### Ohne die 499er-Daten
```{r}
# Keep only rows whose `n_max` differs from 499. filter() also drops rows
# where the comparison is NA. `count_action2` is reused by the plot further
# down.
count_action2 <-
  count_action |>
  filter(n_max != 499)

# Same descriptive table as above, now on the filtered data.
count_action2 |>
  describe_distribution(n_max) |>
  gt() |>
  fmt_number(
    columns = where(is.numeric),
    decimals = 2
  )
```
## Verteilung
### Mit den 499er-Daten
```{r plot-count-action}
# Mean and SD of `n_max`; reused for the plot annotations and for the inline
# values reported below the figure.
count_action_avg <- mean(count_action$n_max)
count_action_sd <- sd(count_action$n_max)

# Histogram of `n_max` with the mean (vertical line) and mean +/- 1 SD
# (horizontal bar on the baseline) overlaid.
count_action |>
  ggplot() +
  geom_histogram(aes(x = n_max)) +
  labs(
    x = "Anzahl von Aktionen pro Visit",
    y = "n",
    caption = "Der vertikale Strich zeigt den Mittelwert; der horizontale die SD"
  ) +
  theme_minimal() +
  geom_vline(
    xintercept = count_action_avg,
    color = palette_okabe_ito()[1]
  ) +
  # annotate() draws the bar once instead of once per data row, and
  # `linewidth` replaces `size`, which ggplot2 >= 3.4.0 deprecates for lines.
  annotate(
    "segment",
    x = count_action_avg - count_action_sd,
    xend = count_action_avg + count_action_sd,
    y = 0,
    yend = 0,
    color = palette_okabe_ito()[2],
    linewidth = 2
  ) +
  annotate("label", x = count_action_avg, y = 1500, label = "MW") +
  annotate("label", x = count_action_avg + count_action_sd, y = 0, label = "SD")
```
- Mittelwert der Aktionen pro Visit: `r round(count_action_avg, 2)`.
- SD der Aktionen pro Visit: `r round(count_action_sd, 2)`.
### Ohne 499er-Daten
```{r plot-count-action-2}
# Mean and SD of `n_max` after removing the 499 values; reused for the plot
# annotations and for the inline values reported below the figure.
count_action_avg2 <- mean(count_action2$n_max)
count_action_sd2 <- sd(count_action2$n_max)

# Histogram of the filtered `n_max` with the mean (vertical line) and
# mean +/- 1 SD (horizontal bar on the baseline) overlaid.
count_action2 |>
  ggplot() +
  geom_histogram(aes(x = n_max)) +
  labs(
    x = "Anzahl von Aktionen pro Visit",
    y = "n",
    title = "Verteilung der User-Aktionen pro Visit",
    caption = "Der vertikale Strich zeigt den Mittelwert; der horizontale die SD"
  ) +
  theme_minimal() +
  geom_vline(
    xintercept = count_action_avg2,
    color = palette_okabe_ito()[1]
  ) +
  # Both ends of the bar use the statistics of the filtered data. The left
  # end previously used the mean of the unfiltered data by mistake.
  # annotate() draws the bar once instead of once per data row, and
  # `linewidth` replaces `size`, which ggplot2 >= 3.4.0 deprecates for lines.
  annotate(
    "segment",
    x = count_action_avg2 - count_action_sd2,
    xend = count_action_avg2 + count_action_sd2,
    y = 0,
    yend = 0,
    color = palette_okabe_ito()[2],
    linewidth = 2
  ) +
  annotate(
    "label",
    x = count_action_avg2,
    y = 1500,
    label = "MW",
    vjust = "top"
  ) +
  annotate(
    "label",
    x = count_action_avg2 + count_action_sd2,
    y = 0,
    label = "SD",
    vjust = "bottom"
  )
```
- Mittelwert der Aktionen pro Visit: `r round(count_action_avg2, 2)`.
- SD der Aktionen pro Visit: `r round(count_action_sd2, 2)`.
# Zeit pro Visit
Die Visit-Zeit wurde auf 600 Min. begrenzt.
```{r load-time-spent}
# Load the `time_spent` target; its `time_diff` column is analysed below.
tar_load(time_spent)
```
```{r}
# Express `time_diff` as a plain number of minutes (`t_min`) and keep only
# rows below the 600-minute cap. Rows with a missing `t_min` are dropped by
# filter() as well. Overwrites `time_spent` in place.
time_spent <-
  time_spent |>
  mutate(t_min = as.numeric(time_diff, units = "mins")) |>
  filter(t_min < 600)
```
## Verweildauer-Statistiken in Sekunden
```{r comp-diff-time-stats}
# Summary statistics of the time per visit, explicitly in seconds.
# `time_diff` is converted to a plain number of seconds first, mirroring how
# `t_min` is derived with units = "mins". This guarantees the unit promised
# by the section heading and gives fmt_number() ordinary numeric columns
# instead of a mix of difftime and numeric values.
time_spent |>
  mutate(t_sec = as.numeric(time_diff, units = "secs")) |>
  summarise(
    mean_time_diff = mean(t_sec),
    sd_time_diff = sd(t_sec),
    min_time_diff = min(t_sec),
    max_time_diff = max(t_sec)
  ) |>
  gt() |>
  # Rounding is done once here, for every column alike.
  fmt_number(
    columns = everything(),
    decimals = 2
  )
```
## Verweildauer-Statistiken in Minuten
```{r}
# Mean, SD, minimum and maximum of `t_min` (minutes per visit), rendered as a
# gt table with two decimals.
time_spent |>
  summarise(
    mean_t_min = mean(t_min),
    sd_t_min = sd(t_min),
    min_t_min = min(t_min),
    max_t_min = max(t_min)
  ) |>
  gt() |>
  fmt_number(
    columns = everything(),
    decimals = 2
  )
```
## Visualisierung der Verweildauer
### bins=20
```{r plot-time-spent1}
# Histogram of the time per visit in minutes, using the 20 bins announced in
# the section heading. `t_min` is a plain number of minutes, so the default
# continuous x scale is used. scale_x_time() would label the values as if
# they were seconds.
time_spent |>
  ggplot(aes(x = t_min)) +
  geom_histogram(bins = 20) +
  theme_minimal() +
  labs(
    y = "n",
    x = "Verweildauer in HaNS pro Visit in Minuten"
  )
```
### binwidth = 5 Min.
```{r plot-time-spent2}
# Histogram of the time per visit with 5-minute-wide bins.
time_spent |>
  ggplot(aes(x = t_min)) +
  geom_histogram(binwidth = 5) +
  theme_minimal() +
  labs(
    y = "n",
    x = "Verweildauer in Minuten",
    title = "Verweildauer in HaNS pro Visit",
    caption = "binwidth = 5 Min."
  )
```
### Zeitdauer begrenzt auf 1-120 Min.
```{r plot-time-spent3}
# Restrict to visits lasting strictly between 1 and 120 minutes.
time_spent2 <-
  time_spent |>
  filter(t_min > 1, t_min < 120)

# Histogram of the restricted data with 10-minute-wide bins.
# The caption typo "bindwidth" is corrected to "binwidth".
time_spent2 |>
  ggplot(aes(x = t_min)) +
  geom_histogram(binwidth = 10) +
  theme_minimal() +
  labs(
    y = "n",
    x = "Verweildauer in HaNS pro Visit in Minuten",
    title = "Verweildauer begrenzt auf 1-120 Minuten",
    caption = "binwidth = 10 Min."
  )
```
# Was machen die User?
```{r tar-load-count-action-type}
# Load the `count_action_type` target; its `category` column is counted below.
tar_load(count_action_type)
```
## Statistiken
```{r}
# Frequency of each action category, most frequent first, as a gt table.
count_action_type |>
  count(category, sort = TRUE) |>
  gt()
```
## Verteilung
### Rohwerte
```{r vis-count-action-type}
# Horizontal bar chart of the number of actions per category, ordered by
# frequency, with the count printed on each bar.
count_action_type |>
  count(category, sort = TRUE) |>
  ggplot(aes(y = reorder(category, n), x = n)) +
  geom_col() +
  geom_bar_text() +
  labs(
    # The x axis shows the count `n`, so it is labelled as a count, matching
    # the log-scaled version of this plot.
    x = "Anzahl der User-Aktionen",
    y = "Aktion",
    title = "Anzahl der User-Aktionen nach Kategorie"
  ) +
  theme_minimal() +
  scale_x_continuous(labels = scales::comma)
```
### Log-Skalierung
```{r vis-count-action-type-log}
#| fig-width: 9
# Same bar chart as above, but with a log10 x axis so that rare categories
# remain visible next to the dominant ones.
# The axis label typo "Anazhl" is corrected to "Anzahl".
count_action_type |>
  count(category, sort = TRUE) |>
  ggplot(aes(y = reorder(category, n), x = n)) +
  geom_col() +
  geom_bar_text() +
  labs(
    x = "Anzahl der User-Aktionen",
    y = "Aktion",
    title = "Anzahl der User-Aktionen nach Kategorie",
    caption = "Log10-Skala"
  ) +
  theme_minimal() +
  scale_x_log10()
```
# An welchen Tagen und zu welcher Zeit kommen die User zu HaNS?
## Setup
```{r}
# Load the `time_visit_wday` target (columns `dow`, `hour` and `date_time`
# are used in the plots below).
tar_load(time_visit_wday)
```
```{r}
# Weekday names in display order; position i is the label for `dow == i`.
# NOTE(review): this assumes `dow` is coded 1 = Monday ... 7 = Sunday. Confirm
# against how `dow` is computed upstream (lubridate::wday(), for instance,
# defaults to 1 = Sunday).
days_of_week <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)

# Add `dow2`: the weekday name looked up by the numeric `dow` code, stored as
# a factor whose level order follows `days_of_week`.
time_visit_wday$dow2 <- factor(
  days_of_week[time_visit_wday$dow],
  levels = days_of_week
)
```
## HaNS-Login nach Uhrzeit
```{r vis-hans-login-hour}
# Share of visits per hour of the day, drawn as a bar chart.
time_visit_wday |>
  as_tibble() |>
  count(hour) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "HaNS-Nutzer sind keine Frühaufsteher",
    x = "Uhrzeit",
    y = "Anteil"
  )
# The polar variant of this plot is produced by the next chunk.
```
```{r vis-hans-login-hour-polar}
# Share of visits per hour of the day in polar coordinates.
time_visit_wday |>
  as_tibble() |>
  count(hour) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  theme_minimal() +
  coord_polar()
```
## Verteilung der HaNS-Besuche nach Wochentagen
```{r vis-hans-login-wday-bar}
# Share of visits per weekday (`dow2`), drawn as a bar chart.
time_visit_wday |>
  as_tibble() |>
  count(dow2) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = dow2, y = prop)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Verteilung der HaNS-Logins nach Wochentagen",
    x = "Wochentag",
    y = "Anteil"
  )
# The polar variant of this plot is produced by the next chunk.
```
```{r vis-hans-login-wday-polar}
# Share of visits per weekday (`dow2`) in polar coordinates.
time_visit_wday |>
  as_tibble() |>
  count(dow2) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = dow2, y = prop)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Verteilung der HaNS-Logins nach Wochentagen",
    x = "Wochentag",
    y = "Anteil"
  ) +
  coord_polar()
```
### HaNS-Login nach Wochentagen und Uhrzeit
```{r vis-hans-login-wday-hour}
# Share of visits per hour, computed within each weekday and shown as one
# facet per weekday.
time_visit_wday |>
  as_tibble() |>
  count(dow2, hour) |>
  # Proportions are taken within each weekday, so every facet sums to 1.
  group_by(dow2) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  facet_wrap(~dow2) +
  theme_minimal() +
  labs(
    title = "Verteilung der HaNS-Logins nach Wochentagen und Uhrzeiten",
    # The x axis maps `hour`; weekdays are the facets.
    x = "Uhrzeit",
    y = "Anteil"
  )
```
```{r vis-hans-login-wday-hour-polar}
#| fig-width: 9
#| fig-asp: 1.5
# Polar version of the weekday-by-hour plot: share of visits per hour,
# computed within each weekday, one facet per weekday.
time_visit_wday |>
  as_tibble() |>
  count(dow2, hour) |>
  # Proportions are taken within each weekday, so every facet sums to 1.
  group_by(dow2) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  facet_wrap(~dow2) +
  theme_minimal() +
  labs(
    title = "Verteilung der HaNS-Logins nach Wochentagen und Uhrzeiten",
    # The x aesthetic maps `hour`; weekdays are the facets.
    x = "Uhrzeit",
    y = "Anteil"
  ) +
  coord_polar()
```
## Anzahl der Visits nach Datum (Tagen) und Uhrzeit (bin2d)
```{r}
# Drop any grouping and derive the calendar date from `date_time`.
# `time2` is reused by the two weekly heatmaps that follow.
# NOTE(review): as.Date() on a POSIXct value uses UTC unless `tz` is given.
# Confirm this matches the time zone used to compute `hour`.
time2 <-
  time_visit_wday |>
  ungroup() |>
  mutate(date = as.Date(date_time))

# Heatmap of visit counts per calendar day (x) and hour of day (y).
time2 |>
  ggplot(aes(x = date, y = hour)) +
  geom_bin2d(binwidth = c(1, 1)) + # (1 day, 1 hour)
  scale_x_date(date_breaks = "1 month") +
  theme(legend.position = "bottom") +
  scale_fill_viridis_c() +
  # The x bin width is 1 day, so the caption says "day", not "week".
  labs(caption = "Each x-bin maps to one day")
```
## Anzahl der Visits nach Datum (Wochen) und Uhrzeit (bin2d)
```{r}
# Heatmap of visit counts per week (x, 7-day bins) and hour of day (y).
# The x axis is labelled with the week number (`%W`).
time2 |>
  ggplot(aes(x = date, y = hour)) +
  geom_bin2d(binwidth = c(7, 1)) + # (1 week, 1 hour)
  scale_x_date(
    date_breaks = "1 week",
    date_labels = "%W"
  ) +
  theme(legend.position = "bottom") +
  scale_fill_viridis_c() +
  labs(
    x = "Week number in 2023/2024",
    caption = "Each x-bin maps to one week"
  )
```
## Anzahl der Visits nach Datum (Wochen) und Wochentag (bin2d)
```{r}
# Heatmap of visit counts per week (x, 7-day bins) and numeric weekday code
# `dow` (y), with one y-axis break per weekday.
time2 |>
  ggplot(aes(x = date, y = dow)) +
  geom_bin2d(binwidth = c(7, 1)) + # (1 week, 1 day of week)
  scale_x_date(
    date_breaks = "1 week",
    date_labels = "%W"
  ) +
  theme(legend.position = "bottom") +
  scale_fill_viridis_c() +
  labs(
    x = "Week number in 2023/2024",
    caption = "Each x-bin maps to one week",
    y = "Day of Week"
  ) +
  scale_y_continuous(breaks = 1:7)
```
# KI-Gebrauch
## Welcher Anteil der Nutzenden klickt auf ein Wort im Transkript?
```{r}
# Load the `data_slim` target (columns `type` and `value` are used below).
tar_load(data_slim)
```
```{r}
# Among "subtitle" rows with a non-missing, non-empty `value`, count how many
# contain the string "click_transcript_word" and report the shares.
data_slim |>
  filter(
    type == "subtitle",
    !is.na(value),
    value != ""
  ) |>
  count(click_transcript_word = str_detect(value, "click_transcript_word")) |>
  mutate(prop = n / sum(n)) |>
  gt()
```
## ... Aufteilung nach Monaten
```{r}
# Load the precomputed `ai_transcript_clicks_per_month` target.
tar_load(ai_transcript_clicks_per_month)
```
```{r}
# Render the monthly breakdown as a gt table without further processing.
gt(ai_transcript_clicks_per_month)
```